Global_fishing_watch_workflow_predict_everywhere

Author

Théophile L. Mouton

Published

October 1, 2024

Combining AIS and Sentinel-1 SAR fishing effort data from Global Fishing Watch

Open R libraries

Code
# Load required libraries
library(tidyverse)
library(tidyr)
library(ggplot2)
library(data.table)
library(gridExtra)
library(maps)
library(raster)
library(sf)
library(viridis)
library(scales)
library(dplyr)
library(randomForest)
library(caret)
library(pdp)
library(knitr)
library(kableExtra)
library(future)
library(spdep)
library(ncf)
library(blockCV)
library(parallel)
library(doParallel)
library(sp)

AIS data

Data from Kroodsma et al. (2018) Science, accessible at: Global Fishing Watch Data Download Portal.

The data is accessible up to the end of the year 2020; we used four entire years of data (2017, 2018, 2019, 2020) to match SAR data records.

Code
# Optional (commented) pipeline to rebuild AIS_fishing from the raw GFW CSVs;
# the pre-assembled .Rdata is loaded below instead.

# Set the path to the 2017-2020 folder

#path <- "Data/AIS Fishing Effort 2017-2020"

# List all CSV files in the folder
#AIS_csv_files <- list.files(path = here::here(path), pattern = "*.csv", full.names = TRUE, recursive = TRUE)

# Read all CSV files and combine them into a single data frame
#AIS_fishing <- AIS_csv_files %>%
#  map_df(~read_csv(.))

# Load the pre-built data; defines the `AIS_fishing` data frame
# (used below via columns cell_ll_lat, cell_ll_lon, fishing_hours).
load(here::here("Data","AIS_fishing.Rdata"))

# Aggregate fishing hours by latitude and longitude:
# collapse per-record AIS data to one row per native grid cell (lower-left
# lat/lon), summing fishing hours across all 2017-2020 records.
aggregated_AIS_fishing <- AIS_fishing %>%
  group_by(cell_ll_lat, cell_ll_lon) %>%
  summarise(total_fishing_hours = sum(fishing_hours, na.rm = TRUE)) %>%
  ungroup() %>%
  # Remove any cells with zero or negative fishing hours
  filter(total_fishing_hours > 0)

# Snap longitude/latitude values onto a 0.1-degree grid.
#
# Each coordinate is truncated toward -Inf to its cell's lower-left corner
# (floor(x * 10) / 10), e.g. 12.34 -> 12.3 and -5.67 -> -5.7. Works on
# vectors as well as scalars. Returns a list with `lon_std` and `lat_std`.
standardize_coords <- function(lon, lat) {
  snap_to_grid <- function(deg) floor(deg * 10) / 10
  list(
    lon_std = snap_to_grid(lon),
    lat_std = snap_to_grid(lat)
  )
}

# Standardize and aggregate AIS data to the 0.1-degree grid.
# NOTE: standardize_coords() is floor()-based and therefore vectorized, so it
# is applied to the full coordinate columns in one call instead of row by row
# with map2() — identical output, far less overhead on millions of cells.
AIS_coords_std <- standardize_coords(aggregated_AIS_fishing$cell_ll_lon,
                                     aggregated_AIS_fishing$cell_ll_lat)

AIS_data_std <- aggregated_AIS_fishing %>%
  mutate(
    lon_std = AIS_coords_std$lon_std,
    lat_std = AIS_coords_std$lat_std
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_fishing_hours = sum(total_fishing_hours, na.rm = TRUE), .groups = "drop")

# Land-polygon background for the global map
world_map <- map_data("world")

# Global map of total AIS fishing effort (one tile per 0.1-degree cell)
ggplot() +
  # NOTE(review): `long =` / `lat =` are not standard geom_map() aesthetics
  # (the map join is driven by map_id); later chunks pass long/lat
  # positionally as x/y — confirm ggplot2 is not just ignoring these here.
  geom_map(data = world_map, map = world_map,
           aes(long = long, lat = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = AIS_data_std, 
            aes(x = lon_std, y = lat_std, fill = total_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    # log1p compresses the heavy right tail while keeping low-effort cells visible
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)", 
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = NULL,
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

Sentinel-1 SAR data

Data from Paolo et al. 2024 Nature, accessible at: Global Fishing Watch SAR Vessel Detections

The data is accessible from 2017 to today; we used four entire years of data (2017, 2018, 2019, 2020) to match AIS records.

Code
# Optional (commented) pipeline to rebuild SAR_fishing from the raw GFW CSVs;
# the pre-assembled .Rdata is loaded below instead.
# Set the path to the 2016 folder
#path <- "Data/SAR Vessel detections 2017-2020"

# List all CSV files in the folder
#SAR_csv_files <- list.files(path = here::here(path), pattern = "*.csv", full.names = TRUE)

# Read all CSV files and combine them into a single data frame
#SAR_fishing <- SAR_csv_files %>%
#  map_df(~read_csv(.))

# Load the pre-built data; defines the `SAR_fishing` data frame
# (used below via columns lat, lon, presence_score, fishing_score).
load(here::here("Data","SAR_fishing.Rdata"))

# Aggregate SAR detections by latitude and longitude.
# Keep only detections classified as fishing with high confidence
# (fishing_score >= 0.9), then collapse to one row per 0.01-degree cell.
# The score filter is row-wise, so it is applied BEFORE group_by(): the
# result is identical to filtering inside the groups, but avoids grouping
# rows that are about to be discarded.
aggregated_SAR_fishing <- SAR_fishing %>%
  filter(fishing_score >= 0.9) %>%
  mutate(
    lat_rounded = round(lat, digits = 2),
    lon_rounded = round(lon, digits = 2)
  ) %>%
  group_by(lat_rounded, lon_rounded) %>%
  summarise(
    total_presence_score = sum(presence_score, na.rm = TRUE),
    avg_fishing_score = mean(fishing_score, na.rm = TRUE),
    count = n()
  ) %>%
  mutate(total_presence_score = round(total_presence_score, digits = 0)) %>%
  ungroup()

# Standardize and aggregate SAR data to the 0.1-degree grid.
# NOTE: standardize_coords() is floor()-based and therefore vectorized, so it
# is applied to the full coordinate columns in one call instead of row by row
# with map2() — identical output, much faster on large grids.
SAR_coords_std <- standardize_coords(aggregated_SAR_fishing$lon_rounded,
                                     aggregated_SAR_fishing$lat_rounded)

SAR_data_std <- aggregated_SAR_fishing %>%
  mutate(
    lon_std = SAR_coords_std$lon_std,
    lat_std = SAR_coords_std$lat_std
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_presence_score = sum(total_presence_score, na.rm = TRUE), .groups = "drop")

# Land-polygon background for the global map
world_map <- map_data("world")

# Global map of summed SAR fishing-vessel detections (one tile per 0.1° cell)
ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long = long, lat = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = SAR_data_std, 
            aes(x = lon_std, y = lat_std, fill = total_presence_score)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    # log1p compresses the heavy right tail while keeping low-count cells visible
    trans = "log1p",
    name = "Fishing vessel detections (2017-2020)", 
    breaks = c(0, 1, 10, 100, 1000, 10000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = NULL,
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

Compare AIS and SAR detection locations

Identifying grid cells with only AIS, only SAR detections or both data types.

Code
# Merge the datasets.
# full_join keeps every 0.1-degree cell present in EITHER dataset; cells
# missing from one side get NA in that side's value column.
combined_data <- full_join(
  AIS_data_std,
  SAR_data_std,
  by = c("lon_std", "lat_std")
)

# Categorize each cell by which data source(s) detected fishing there.
# case_when evaluates top-down, so "Both" wins before the single-source
# branches. Every row comes from at least one input, so the fallback
# "No fishing detected" branch should be empty (the rendered summary table
# shows no such rows) — kept as a safety net.
combined_data <- combined_data %>%
  mutate(category = case_when(
    total_fishing_hours > 0 & total_presence_score > 0 ~ "Both AIS and SAR",
    total_fishing_hours > 0 & (is.na(total_presence_score) | total_presence_score == 0) ~ "Only AIS",
    (is.na(total_fishing_hours) | total_fishing_hours == 0) & total_presence_score > 0 ~ "Only SAR",
    TRUE ~ "No fishing detected"
  ))

# Land-polygon background for the global map
world_map <- map_data("world")

# Create the plot: land background plus one coloured tile per cell category
world_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long = long, lat = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data, 
            aes(x = lon_std, y = lat_std, fill = category)) +
  # Fixed colours so the legend order/meaning is stable across renders
  scale_fill_manual(
    values = c("Both AIS and SAR" = "purple", 
               "Only AIS" = "blue", 
               "Only SAR" = "red", 
               "No fishing detected" = "white"),
    name = "Fishing Data Source",
    guide = guide_legend(title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Fishing Detection",
       subtitle = "Comparison of AIS (2017-2020) and SAR (2017-2020) data at 0.1-degree resolution",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

# Display the plot
print(world_plot)

Code
# Get summary statistics: cell counts and percentage share per category
summary_stats <- combined_data %>% 
  count(category) %>% 
  mutate(percentage = n / sum(n) * 100) %>%
  rename(`Number of cells` = n) %>%
  mutate(percentage = round(percentage, 2))

# Create and display the table (HTML via kable/kableExtra)
kable(summary_stats, 
      format = "html", 
      col.names = c("Category", "Number of cells", "Percentage (%)"),
      caption = "Summary Statistics of Data Categories") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, 
                position = "center") %>%
  column_spec(2, width = "150px") %>%
  column_spec(3, width = "150px")
Summary Statistics of Data Categories
Category Number of cells Percentage (%)
Both AIS and SAR 163095 9.12
Only AIS 1566190 87.60
Only SAR 58668 3.28

Random forest model

Predicting fishing hours in areas with only SAR data at a 0.1 degree resolution. Using a random forest model of >160,000 associations between SAR vessel detections and AIS fishing hours globally, geographical coordinates, distance to ports, distance to shore and bathymetry.

Code
# Open the saved adjusted rasters (0.1-degree environmental covariate layers)
Ports_adjusted <- raster(here::here("Data", "Environmental data layers", "distance-from-port-0.1deg-adjusted.tif"))
Shore_adjusted <- raster(here::here("Data", "Environmental data layers", "distance-from-shore-0.1deg-adjusted.tif"))
Bathy_adjusted <- raster(here::here("Data", "Environmental data layers", "bathymetry-0.1deg-adjusted.tif"))

# Stack the resampled rasters (layer order must match the renaming below)
raster_stack <- stack(Shore_adjusted, Ports_adjusted, Bathy_adjusted)

# Convert the stack to a dataframe: x/y cell centres plus one column per layer
raster_df <- as.data.frame(raster_stack, xy = TRUE)

# Rename the columns — order follows the stack: shore, ports, bathymetry
names(raster_df) <- c("x", "y", "dist_shore", "dist_ports", "bathy")

# Remove cells where any covariate is missing
raster_df <- na.omit(raster_df)

# Convert to data.table for efficiency
setDT(raster_df)

# Round x and y to 1 decimal place so they line up with the lon_std/lat_std
# keys used by the fishing-effort tables
raster_df[, `:=`(
  lon_std = round(x, digits = 1),
  lat_std = round(y, digits = 1)
)]

# Keep only ocean areas (negative bathymetry values)
raster_df <- raster_df[bathy < 0]

# Keep the first occurrence of each coordinate pair (because there are duplicates)
# (unique() on a data.table already returns a data.table; the setDT below is redundant but harmless)
raster_df <- unique(raster_df, by = c("lon_std", "lat_std"))
setDT(raster_df)

# Now proceed with the join and model training.
# NOTE(review): path uses lowercase "data" and file name "O1deg" (capital O?)
# while other chunks use "Data"/"01deg" — confirm these match the on-disk names.
load(here::here("data","combined_data_O1deg.Rdata"))

# Keep the first occurrence of each coordinate pair (because there are duplicates)
combined_data_01deg <- unique(combined_data_01deg, by = c("lon_std", "lat_std"))
setDT(combined_data_01deg)

# Perform the join using data.table. nomatch = 0 makes this an INNER join:
# only cells that have both fishing data and raster covariates survive.
combined_data_with_rasters <- raster_df[combined_data_01deg, 
                                        on = .(lon_std, lat_std), 
                                        nomatch = 0]

# Convert back to dataframe if needed
combined_data_with_rasters <- as.data.frame(combined_data_with_rasters)

# Create the new combined dataframe: every ocean cell that has covariates
combined_data_all <- raster_df[, .(lon_std, lat_std, dist_shore, dist_ports, bathy)]

# Perform the join. all.x = TRUE is a LEFT join: cells with no fishing data
# are kept with NA in the fishing columns.
combined_data_all <- merge(combined_data_all, combined_data_01deg, 
                           by = c("lon_std", "lat_std"), 
                           all.x = TRUE)

# Fill NA values for has_AIS and has_SAR with FALSE (cells the join left empty)
combined_data_all[is.na(has_AIS), has_AIS := FALSE]
combined_data_all[is.na(has_SAR), has_SAR := FALSE]

# Categorize each cell. case_when evaluates top-down, so cells with both AIS
# and SAR data fall into "AIS Data".
combined_data_all <- combined_data_all %>%
  mutate(category = case_when(
    has_AIS == TRUE ~ "AIS Data",
    has_SAR == TRUE ~ "SAR Data Only",
    TRUE ~ "No AIS or SAR Data"
  ))

# Land-polygon background for the global map
world_map <- map_data("world")

# Create the plot: colour every covariate-covered ocean cell by data coverage
world_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long = long, lat = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  # color = category as well as fill, so tile borders match at global scale
  geom_tile(data = combined_data_all, 
            aes(x = lon_std, y = lat_std, fill = category, color = category)) +
  scale_fill_manual(
    values = c("AIS Data" = "blue", 
               "SAR Data Only" = "red", 
               "No AIS or SAR Data" = "black"),
    name = "Data Source",
    guide = guide_legend(title.position = "top", title.hjust = 0.5)
  ) +
  # Same palette for the borders; legend suppressed to avoid duplication
  scale_color_manual(
    values = c("AIS Data" = "blue", 
               "SAR Data Only" = "red", 
               "No AIS or SAR Data" = "black"),
    guide = "none"
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Data Coverage",
       subtitle = "Distribution of AIS and SAR data at 0.1-degree resolution",
       x = "Longitude", y = "Latitude") +
 theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10)),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text = element_text(size = 8),
    axis.title = element_text(size = 10),
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12)
  )

# Print the plot
print(world_plot)

Code
# Calculate summary statistics: re-derive the three coverage categories,
# then count cells and percentage share per category (chained data.table)
summary_stats <- combined_data_all[, .(
  data_type = case_when(
    has_AIS == TRUE ~ "AIS Data",
    has_SAR == TRUE ~ "SAR Data Only",
    TRUE ~ "No AIS or SAR Data"
  )
)][, .(
  num_cells = .N,
  percentage = .N / nrow(combined_data_all) * 100
), by = data_type]

# Order the data types for display (match() gives the desired fixed order)
summary_stats <- summary_stats[order(match(data_type, c("AIS Data", "SAR Data Only", "No AIS or SAR Data")))]

# Add total row
total_row <- data.table(
  data_type = "Total",
  num_cells = sum(summary_stats$num_cells),
  percentage = 100
)

summary_stats <- rbindlist(list(summary_stats, total_row))

# Create kable (HTML table; total row highlighted below)
kable_output <- summary_stats %>%
  kable(
    col.names = c("Data Type", "Number of Cells", "Percentage (%)"),
    digits = c(0, 0, 2),
    align = c("l", "r", "r"),
    caption = "Summary Statistics of Data Types"
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE) %>%
  row_spec(nrow(summary_stats), bold = TRUE, background = "#F0F0F0") %>%
  footnote(
    general = "AIS Data category includes cells with both AIS and SAR data.",
    general_title = "Note:",
    footnote_as_chunk = TRUE
  )

# Print the kable
kable_output
Summary Statistics of Data Types
Data Type Number of Cells Percentage (%)
AIS Data 1240429 42.14
SAR Data Only 39915 1.36
No AIS or SAR Data 1663016 56.50
Total 2943360 100.00
Note: AIS Data category includes cells with both AIS and SAR data.
Code
# Prepare the training data: only cells observed by BOTH AIS and SAR, keeping
# the response (total_fishing_hours), the SAR predictor (total_presence_score)
# and the spatial/environmental covariates; drop rows with any missing value.
training_data <- combined_data_with_rasters %>%
  filter(has_AIS & has_SAR) %>%
  dplyr::select(total_fishing_hours, total_presence_score, lon_std, lat_std, dist_shore, dist_ports, bathy) %>%
  na.omit()

Comparison of transformations in models

Code
# Prepare the data
#load(here::here("training_data.Rdata"))

#training_data_log <- training_data %>%
#  mutate(
#    log_total_presence_score = log10(total_presence_score + 1),
#    log_total_fishing_hours = log10(total_fishing_hours + 1)
#  )

# Function to run a single model
#run_model <- function(formula, data) {
#  randomForest(
#    formula,
#    data = data,
#    ntree = 500,
#    importance = TRUE
#  )
#}

# Set up parallel processing
#num_cores <- detectCores() - 1  # Use all but one core
#cl <- makeCluster(num_cores)

# Export necessary objects to the cluster
#clusterExport(cl, c("training_data_log", "run_model"))

# Load required packages on each cluster
#clusterEvalQ(cl, library(randomForest))

# Define the models
#models <- list(
#  no_transform = as.formula(total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + #dist_ports + bathy),
#  original = as.formula(total_fishing_hours ~ log_total_presence_score + lon_std + lat_std + dist_shore + #dist_ports + bathy),
#  log = as.formula(log_total_fishing_hours ~ log_total_presence_score + lon_std + lat_std + dist_shore + #dist_ports + bathy)
#)

# Run models in parallel
#results <- parLapply(cl, models, function(formula) run_model(formula, training_data_log))

# Stop the cluster
#stopCluster(cl)

# Save the models
#rf_model_no_transform <- results[[1]]
#rf_model_original <- results[[2]]
#rf_model_log <- results[[3]]

# Save models to files
#saveRDS(rf_model_no_transform, "rf_model_no_transform.rds")
#saveRDS(rf_model_original, "rf_model_original.rds")
#saveRDS(rf_model_log, "rf_model_log.rds")

# Add the new model with only total_fishing_hours log-transformed
#rf_model_fishing_log <- randomForest(
#  log_total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
#  data = training_data_log,
#  ntree = 500,
#  importance = TRUE
#)

# Save the new model
#saveRDS(rf_model_fishing_log, "rf_model_fishing_log.rds")

# Load the saved models
rf_model_no_transform <- readRDS(here::here("rf_model_no_transform.rds"))
rf_model_original <- readRDS(here::here("rf_model_original.rds"))
rf_model_log <- readRDS(here::here("rf_model_log.rds"))
rf_model_fishing_log <- readRDS(here::here("rf_model_fishing_log.rds"))

# Function to evaluate models.
# Evaluate a fitted randomForest regression model against `data`.
#   model:      a randomForest model object.
#   data:       data frame with the predictors plus total_fishing_hours.
#   log_target: TRUE when the model predicts log10(y + 1); predictions are
#               back-transformed to raw fishing hours before scoring.
# Returns a named list of error metrics, R-squared values, residual summaries
# and the model's variable-importance matrix.
evaluate_model <- function(model, data, log_target = FALSE) {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    # Invert the log10(y + 1) transform applied at training time
    predictions <- 10^predictions - 1
  }
  
  actual <- data$total_fishing_hours
  
  # Basic Error Metrics (all on the raw fishing-hours scale)
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  # NOTE: MAPE is unstable when `actual` is near zero
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)
  medae <- median(abs(actual - predictions), na.rm = TRUE)
  
  # R-squared (matching randomForest's % Var explained: last OOB entry)
  r_squared <- model$rsq[length(model$rsq)]
  
  # Adjusted R-squared.
  # BUG FIX: `model$forest$independent.variable.names` is a ranger field that
  # does not exist on randomForest objects, so p was silently 0 and the
  # adjusted R-squared always equalled the plain R-squared. The importance
  # matrix has one row per predictor, so count predictors from it instead.
  n <- length(actual)
  p <- nrow(importance(model)) # Number of predictors
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))
  
  # Residual Analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)
  
  # Feature Importance (for Random Forest)
  feature_importance <- importance(model)
  
  return(list(
    "Mean Absolute Error" = mae,
    "Root Mean Squared Error" = rmse,
    "Mean Absolute Percentage Error" = mape,
    "Median Absolute Error" = medae,
    "R-Squared" = r_squared,
    "Adjusted R-Squared" = adj_r_squared,
    "Mean of Residuals" = mean_residual,
    "Standard Deviation of Residuals" = sd_residual,
    "Feature Importance" = feature_importance
  ))
}

# Evaluate all models on the cells that have both AIS and SAR data.
# NOTE(review): these are the same cells the models were trained on, so all
# metrics except the OOB-based R-squared are in-sample — confirm this is the
# intended evaluation design.
validation_data <- combined_data_with_rasters %>%
  mutate(
    data_category = case_when(
      has_AIS & has_SAR ~ "Both AIS and SAR",
      has_AIS & !has_SAR ~ "Only AIS",
      !has_AIS & has_SAR ~ "Only SAR",
      TRUE ~ "No fishing detected"
    )
  )

validation_data <- validation_data %>% filter(data_category == "Both AIS and SAR")

# Evaluate all models
results_no_transform <- evaluate_model(rf_model_no_transform, validation_data)

# The "original" model was trained on the log10-transformed presence score,
# so that column must be recreated here before predicting
validation_data_logpres <- validation_data %>%
  mutate(log_total_presence_score = log10(total_presence_score + 1))

results_original <- evaluate_model(rf_model_original, validation_data_logpres)

# Add evaluation for the new model (fishing hours log-transformed);
# log_target = TRUE back-transforms its predictions before scoring
results_fishing_log <- evaluate_model(rf_model_fishing_log, validation_data, log_target = TRUE)

results_log <- evaluate_model(rf_model_log, validation_data_logpres, log_target = TRUE)

# Assemble the comparison table: one row per metric, one column per model.
# The scalar metrics are pulled out of each evaluate_model() result by name,
# which avoids repeating the eight-element c(...) list four times.
metric_labels <- c("Mean Absolute Error", "Root Mean Squared Error",
                   "Mean Absolute Percentage Error", "Median Absolute Error",
                   "R-Squared", "Adjusted R-Squared",
                   "Mean of Residuals", "Standard Deviation of Residuals")

# Extract the named scalar metrics from one result list, in table order
pull_metrics <- function(res) unname(unlist(res[metric_labels]))

results_df <- data.frame(
  Metric = metric_labels,
  No_Transform = pull_metrics(results_no_transform),
  Fishing_Log = pull_metrics(results_fishing_log),
  Presence_Log = pull_metrics(results_original),
  Both_Log = pull_metrics(results_log)
)

# Render the comparison as an HTML table
kable(results_df, format = "html", digits = 3,
      col.names = c("Metric", "No Transform", "Fishing Hours Log", "Presence Score Log", "Both Log"),
      caption = "Model Performance Comparison") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE) %>%
  add_header_above(c(" " = 1, "Models" = 4)) %>%
  column_spec(1, bold = TRUE)
Model Performance Comparison
Models
Metric No Transform Fishing Hours Log Presence Score Log Both Log
Mean Absolute Error 160.267 186.463 160.912 186.250
Root Mean Squared Error 737.537 1123.120 742.742 1119.788
Mean Absolute Percentage Error 1228.122 71.127 1242.678 71.200
Median Absolute Error 21.973 10.100 22.012 10.104
R-Squared 0.812 0.824 0.808 0.824
Adjusted R-Squared 0.812 0.824 0.808 0.824
Mean of Residuals -5.937 150.586 -6.438 150.287
Standard Deviation of Residuals 737.516 1112.983 742.718 1109.662

Interpretation of model comparison metrics

Based on the provided performance metrics, I would choose the Fishing Hours Log-Transformed Model. Here’s the reasoning:

  1. R-Squared and Adjusted R-Squared: The Fishing Hours Log model has the highest R-squared (0.8239) and Adjusted R-squared values, indicating it explains the most variance in the data.
  2. Mean Absolute Percentage Error (MAPE): This model has a significantly lower MAPE (71.13%) compared to the No Transform and Presence Log models (both over 1200%). This suggests much better relative accuracy. Median Absolute Error: It has the lowest median absolute error (10.10), which indicates good performance on typical cases.
  3. Root Mean Squared Error (RMSE): While higher than the No Transform model, the difference isn’t as dramatic as the improvement in MAPE.
  4. Mean Absolute Error (MAE): Although higher than No Transform and Presence Log models, this should be considered in context with other metrics.

The Both Log model performs very similarly to the Fishing Hours Log model, but the latter edges it out slightly in most metrics.

The No Transform and Presence Log models, despite having lower MAE and RMSE, have extremely high MAPE values, suggesting they might be making large relative errors, especially on smaller values.

The logarithmic transformation of fishing hours seems to have addressed some issues with the data distribution, leading to more balanced performance across different scales of the target variable.

In conclusion, the Fishing Hours Log-Transformed Model appears to offer the best overall performance, particularly in terms of explained variance and relative error metrics. However, the choice might depend on the specific requirements of your application, such as whether absolute or relative errors are more important in your context.

Selected Model performance

Code
# Evaluate a fitted randomForest regression model against `data`.
# Redefinition for the selected model: unlike the earlier version, when
# log_target = TRUE the observed values are taken from the precomputed
# log_total_fishing_hours column and back-transformed, so predictions and
# observations are compared on the same raw scale.
#   model:      a randomForest model object.
#   data:       data frame with the predictors and the response column(s).
#   log_target: TRUE when the model predicts log10(y + 1).
# Returns a named list of error metrics, R-squared values, residual summaries
# and the model's variable-importance matrix.
evaluate_model <- function(model, data, log_target = FALSE) {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    # Invert the log10(y + 1) transform applied at training time
    predictions <- 10^predictions - 1
  }
  
  actual <- if (log_target) 10^data$log_total_fishing_hours - 1 else data$total_fishing_hours
  
  # Basic Error Metrics (all on the raw fishing-hours scale)
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  # NOTE: MAPE is unstable when `actual` is near zero
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)
  medae <- median(abs(actual - predictions), na.rm = TRUE)
  
  # R-squared (matching randomForest's % Var explained: last OOB entry)
  r_squared <- model$rsq[length(model$rsq)]
  
  # Adjusted R-squared.
  # BUG FIX: `model$forest$independent.variable.names` is a ranger field that
  # does not exist on randomForest objects, so p was silently 0 and the
  # adjusted R-squared always equalled the plain R-squared. The importance
  # matrix has one row per predictor, so count predictors from it instead.
  n <- length(actual)
  p <- nrow(importance(model)) # Number of predictors
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))
  
  # Residual Analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)
  
  # Feature Importance (for Random Forest)
  feature_importance <- importance(model)
  
  return(list(
    "Mean Absolute Error" = mae,
    "Root Mean Squared Error" = rmse,
    "Mean Absolute Percentage Error" = mape,
    "Median Absolute Error" = medae,
    "R-Squared" = r_squared,
    "Adjusted R-Squared" = adj_r_squared,
    "Mean of Residuals" = mean_residual,
    "Standard Deviation of Residuals" = sd_residual,
    "Feature Importance" = feature_importance
  ))
}

# Rebuild the validation set, adding the log-transformed response that the
# updated evaluate_model() reads back when log_target = TRUE.
# NOTE(review): as above, these are the training cells — metrics other than
# the OOB R-squared are in-sample.
validation_data <- combined_data_with_rasters %>%
  mutate(
    data_category = case_when(
      has_AIS & has_SAR ~ "Both AIS and SAR",
      has_AIS & !has_SAR ~ "Only AIS",
      !has_AIS & has_SAR ~ "Only SAR",
      TRUE ~ "No fishing detected"
    ),
    log_total_fishing_hours = log10(total_fishing_hours + 1)
  )

# Evaluate the selected model (fishing hours log-transformed)
rf_model_fishing_log <- readRDS(here::here("rf_model_fishing_log.rds"))
validation_data <- validation_data %>% filter(data_category == "Both AIS and SAR")
results_rf_fishing_log <- evaluate_model(rf_model_fishing_log, validation_data, log_target = TRUE)

# Two-column table of the selected model's scalar metrics.
# Metrics are pulled from the result list by name, in display order.
metric_labels <- c("Mean Absolute Error", "Root Mean Squared Error",
                   "Mean Absolute Percentage Error", "Median Absolute Error",
                   "R-Squared", "Adjusted R-Squared",
                   "Mean of Residuals", "Standard Deviation of Residuals")

results_table <- data.frame(
  Metric = metric_labels,
  # Round to 2 decimal places
  Value = round(unname(unlist(results_rf_fishing_log[metric_labels])), 2)
)

# Create and display the table
kable(results_table, format = "html", digits = 4, caption = "Model Evaluation Metrics for Log-Transformed Fishing Hours Model") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center")
Model Evaluation Metrics for Log-Transformed Fishing Hours Model
Metric Value
Mean Absolute Error 186.46
Root Mean Squared Error 1123.12
Mean Absolute Percentage Error 71.13
Median Absolute Error 10.10
R-Squared 0.82
Adjusted R-Squared 0.82
Mean of Residuals 150.59
Standard Deviation of Residuals 1112.98
Code
# For feature importance, create a separate table from the importance matrix
feature_importance <- as.data.frame(results_rf_fishing_log$`Feature Importance`)
feature_importance$Feature <- rownames(feature_importance)
feature_importance <- feature_importance[, c("Feature", "%IncMSE", "IncNodePurity")]

# Sort the feature importance table by %IncMSE in descending order
feature_importance <- feature_importance[order(-feature_importance$`%IncMSE`), ]

# FIX: drop the row names AFTER sorting — kable() otherwise prints them as an
# extra unnamed column duplicating Feature (visible in the previous render,
# e.g. "lat_std lat_std").
rownames(feature_importance) <- NULL

kable(feature_importance, format = "html", digits = 4, 
      col.names = c("Feature", "%IncMSE", "IncNodePurity"),
      caption = "Feature Importance for Log-Transformed Fishing Hours Model") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:3, width = "150px")
Feature Importance for Log-Transformed Fishing Hours Model
Feature %IncMSE IncNodePurity
lat_std lat_std 298.3858 23760.26
lon_std lon_std 243.3174 26855.30
bathy bathy 214.4595 28714.10
total_presence_score total_presence_score 189.2026 44184.39
dist_shore dist_shore 136.4095 12774.27
dist_ports dist_ports 99.1768 14170.55

Maps of predictions

Code
# Prepare the prediction data
prediction_data <- combined_data_with_rasters %>%
  dplyr::select(total_presence_score, lon_std, lat_std, dist_shore, dist_ports, bathy)

# Make predictions
log_predictions <- predict(rf_model_fishing_log, newdata = prediction_data)

# Back-transform predictions
predictions <- 10^log_predictions - 1

# Add predictions to the original dataset
combined_data_01deg <- combined_data_01deg %>%
  mutate(
    predicted_fishing_hours = case_when(
      has_AIS ~ total_fishing_hours,
      has_SAR ~ predictions[match(paste(lon_std, lat_std), paste(prediction_data$lon_std, prediction_data$lat_std))],
      TRUE ~ 0
    )
  )

#Violin plot of observed versus predicted fishing hours 
# Prepare data for plotting: observed hours for AIS cells, model predictions
# for SAR-only cells, reshaped to long format for ggplot
plot_data <- combined_data_01deg %>%
  mutate(
    ais_fishing_hours = if_else(has_AIS, total_fishing_hours, NA_real_),
    sar_predicted_hours = if_else(!has_AIS & has_SAR, predicted_fishing_hours, NA_real_)
  ) %>%
  dplyr::select(ais_fishing_hours, sar_predicted_hours) %>%
  pivot_longer(cols = c(ais_fishing_hours, sar_predicted_hours),
               names_to = "type",
               values_to = "hours") %>%
  filter(!is.na(hours))

# Create violin plot (log10 y-axis; cells with 0 hours cannot be shown on a
# log scale and are dropped by ggplot with a warning)
violin_plot <- ggplot(plot_data, aes(x = type, y = hours, fill = type)) +
  geom_violin(trim = FALSE) +
  # Narrow boxplot overlaid for medians/quartiles; outliers hidden to reduce clutter
  geom_boxplot(width = 0.1, fill = "white", color = "black", alpha = 0.5, outlier.shape = NA) +
  scale_y_log10(labels = scales::comma_format(accuracy = 1)) +
  scale_x_discrete(labels = c("ais_fishing_hours" = "AIS Data", 
                              "sar_predicted_hours" = "SAR Predictions")) +
  labs(title = "Comparison of AIS Fishing Hours and SAR Predicted Fishing Hours",
       subtitle = "AIS data for AIS-covered areas, Predictions for SAR-only areas",
       x = "",
       y = "Fishing Hours (log scale)",
       fill = "Type") +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

# Print the plot
print(violin_plot)

Code
# Map of predicted fishing hours only.
# Prepare the data for the map: SAR-only cells and their model predictions.
# FIX: this table was previously named `map_data`, which shadowed the
# ggplot2/maps helper map_data() called below — renamed to avoid the clash.
sar_only_predictions <- combined_data_01deg %>%
  filter(!has_AIS & has_SAR) %>%
  dplyr::select(lon_std, lat_std, predicted_fishing_hours)

# The table contains back-transformed predictions, so no further transformation is needed

#predicted_SAR_only_1RF=sar_only_predictions
#save(predicted_SAR_only_1RF, file="predicted_SAR_only_1RF.Rdata")

# Create the world map (land-polygon background)
world_map <- map_data("world")

# Build a predicted-fishing-hours map clipped to one lon/lat window.
#   data:       table with predicted_fishing_hours plus the coordinate
#               columns named by lon_col / lat_col (strings).
#   world_map:  map_data("world") polygons for the land background.
#   lon_range, lat_range: numeric length-2 windows for coord_fixed().
#   title, subtitle: plot labels.
create_region_map <- function(data, world_map, lon_col, lat_col, lon_range, lat_range, title, subtitle) {
  ggplot() +
    geom_map(data = world_map, map = world_map,
             aes(long, lat, map_id = region),
             color = "darkgray", fill = "lightgray", size = 0.1) +
    # .data[[...]] resolves the string column names inside aes()
    geom_tile(data = data, 
              aes(x = .data[[lon_col]], y = .data[[lat_col]], fill = predicted_fishing_hours)) +
    scale_fill_viridis(
      option = "inferno",
      direction = -1,
      trans = "log1p",
      name = "Predicted fishing hours (2017-2020)", 
      breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
      labels = scales::comma,
      guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
    ) +
    coord_fixed(1.3, xlim = lon_range, ylim = lat_range) +
    theme_minimal() +
    labs(title = title,
         subtitle = subtitle,
         x = "Longitude", y = "Latitude") +
    theme(
      legend.position = "bottom",
      legend.direction = "horizontal",
      legend.box = "vertical",
      legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
      legend.title = element_text(margin = ggplot2::margin(b = 10))
    )
}

# Global map
predicted_SAR_only_plot <- create_region_map(sar_only_predictions, world_map, "lon_std", "lat_std", c(-180, 180), c(-90, 90), "Predicted Fishing Hours in Areas with Only SAR Detections", "0.1 degree resolution")

# Caribbean map
caribbean_map <- create_region_map(sar_only_predictions, world_map, "lon_std", "lat_std", c(-100, -50), c(0, 40), "Predicted Fishing Hours in the Caribbean", "0.1 degree resolution")

# Northwestern Indian Ocean to Western European waters map
indian_european_map <- create_region_map(sar_only_predictions, world_map, "lon_std", "lat_std", c(-20, 80), c(0, 70), "Predicted Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic", "0.1 degree resolution")

# Asia map
asia_map <- create_region_map(sar_only_predictions, world_map, "lon_std", "lat_std", c(60, 180), c(-20, 60), "Predicted Fishing Hours in Asia", "0.1 degree resolution")

# Print the global map (regional maps are rendered in the next chunks)
print(predicted_SAR_only_plot)

Code
# Render the Caribbean regional map
print(caribbean_map)

Code
# Render the Northern Indian Ocean to Eastern Atlantic regional map
print(indian_european_map)

Code
# Render the Asia regional map
print(asia_map)

Code
# Map of both original and predicted AIS fishing hours at 0.1-degree
# resolution, on a log1p colour scale shared across the document's maps.
fishing_fill_scale <- scale_fill_viridis(
  option = "inferno",
  direction = -1,
  trans = "log1p",
  name = "AIS Fishing Effort (hours; 2017-2020)",
  breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
  labels = scales::comma,
  guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
)

predicted_plot <- ggplot() +
  geom_map(
    data = world_map, map = world_map,
    aes(long, lat, map_id = region),
    color = "black", fill = "lightgray", size = 0.1
  ) +
  geom_tile(
    data = combined_data_01deg,
    aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)
  ) +
  fishing_fill_scale +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(
    title = "Global Fishing Hours (0.1 degree resolution)",
    subtitle = "Based on AIS data and Random Forest predictions from SAR data",
    x = "Longitude", y = "Latitude"
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

print(predicted_plot)

Code
# --- Aggregate the 0.1-degree grid up to 0.5-degree cells ---
# Snap each cell coordinate to its nearest 0.5-degree value.
combined_data_05deg <- combined_data_01deg %>%
  mutate(
    lon_05deg = round(2 * lon_std) / 2,
    lat_05deg = round(2 * lat_std) / 2
  )

# Sum the predicted hours within each 0.5-degree cell.
aggregated_data <- combined_data_05deg %>%
  group_by(lon_05deg, lat_05deg) %>%
  summarise(
    predicted_fishing_hours = sum(predicted_fishing_hours, na.rm = TRUE),
    .groups = 'drop'
  )

#save(aggregated_data, file=here::here("Predicted_Fishing_Hours_05Deg.Rdata"))

# Global map of the 0.5-degree aggregated fishing hours.
predicted_plot_05deg <- ggplot() +
  geom_map(
    data = world_map, map = world_map,
    aes(long, lat, map_id = region),
    color = "black", fill = "lightgray", size = 0.1
  ) +
  geom_tile(
    data = aggregated_data,
    aes(x = lon_05deg, y = lat_05deg, fill = predicted_fishing_hours)
  ) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(
    title = "Global Fishing Hours (0.5 degree resolution)",
    subtitle = "Based on AIS data and Random Forest predictions from SAR data",
    x = "Longitude", y = "Latitude"
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

# Render the aggregated map
print(predicted_plot_05deg)

Code
# --- Regional close-ups of the 0.5-degree aggregated data ---
caribbean_map <- create_region_map(
  aggregated_data, world_map, "lon_05deg", "lat_05deg",
  c(-100, -50), c(0, 40),
  "Fishing Hours in the Caribbean", "0.5 degree resolution"
)

indian_european_map <- create_region_map(
  aggregated_data, world_map, "lon_05deg", "lat_05deg",
  c(-20, 80), c(0, 70),
  "Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic", "0.5 degree resolution"
)

asia_map <- create_region_map(
  aggregated_data, world_map, "lon_05deg", "lat_05deg",
  c(60, 180), c(-20, 60),
  "Fishing Hours in Asia", "0.5 degree resolution"
)

# Render the Caribbean map first
print(caribbean_map)

Code
# Render the Indian Ocean to eastern Atlantic regional map
print(indian_european_map)

Code
# Render the Asia regional map
print(asia_map)

Test for spatial auto-correlation

Code
##------------ Create a non-random test set 
# Keep only cells observed by both AIS and SAR, with complete covariates,
# and add the log10(y + 1) response used by the model.
data <- combined_data_with_rasters %>%
  filter(has_AIS & has_SAR) %>%
  dplyr::select(total_fishing_hours, total_presence_score, lon_std, lat_std, dist_shore, dist_ports, bathy) %>%
  na.omit() %>%
  mutate(log_total_fishing_hours = log10(total_fishing_hours + 1))

# 70/30 random train/test split (seeded for reproducibility)
set.seed(123)
train_indices <- sample(seq_len(nrow(data)), 0.7 * nrow(data))
train_data <- data[train_indices, ]
test_data <- data[-train_indices, ]

# Train the random forest model (commented out as you're loading a saved model)
# set.seed(123)  # for reproducibility
# rf_timing <- system.time({
#   rf_model_fishing_log_train <- randomForest(
#     log_total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
#     data = train_data,
#     ntree = 500,
#     importance = TRUE
#   )
# })
# 
# cat("Random Forest model training time:", rf_timing["elapsed"], "seconds\n")
# 
# # Save the new model
# save(rf_model_fishing_log_train, file = "rf_model_fishing_log_train.Rdata")

# Load the saved model
#load(here::here("rf_model_fishing_log_train.Rdata"))

# Cached model trained on log10(total_fishing_hours + 1)
rf_model_fishing_log <- readRDS(here::here("rf_model_fishing_log.rds"))

# Make predictions on the test set (log10 scale)
test_data$log_predictions <- predict(rf_model_fishing_log, test_data)

# Back-transform predictions and actual values (inverse of log10(y + 1))
test_data$predictions <- 10^test_data$log_predictions - 1
test_data$total_fishing_hours_original <- 10^test_data$log_total_fishing_hours - 1

# Calculate residuals (observed minus predicted, on the original hours scale)
test_data$residuals <- test_data$total_fishing_hours - test_data$predictions

# Prepare spatial data for autocorrelation analysis (WGS84 point geometry)
test_data_sf <- st_as_sf(test_data, coords = c("lon_std", "lat_std"), crs = 4326)

# Create a neighbors list from the k nearest points
k <- 8  # You can adjust this number
nb <- knearneigh(st_coordinates(test_data_sf), k = k)
nb <- knn2nb(nb)

# Create a row-standardised spatial weights matrix (spdep style "W")
lw <- nb2listw(nb, style="W")

# Perform Moran's I test on the test set residuals
moran_test <- moran.test(test_data_sf$residuals, lw)

# Create a data frame with the test results.
# unname() strips the names carried by moran_test$estimate; without it,
# data.frame() promotes those names to row names and the rendered table
# shows a duplicated label column (e.g. "Moran I statistic Moran I statistic").
moran_results <- data.frame(
  Statistic = c("Moran I statistic", "Expectation", "Variance", "Standard deviate", "P-value"),
  Value = c(
    round(unname(moran_test$estimate[1]), 2),
    round(unname(moran_test$estimate[2]), 2),
    round(unname(moran_test$estimate[3]), 2),
    round(unname(moran_test$statistic), 2),
    # htest p-values are capped in print output at 2.2e-16; mirror that here
    ifelse(moran_test$p.value < 2.2e-16, "< 2.2e-16", format(moran_test$p.value, scientific = TRUE, digits = 4))
  ),
  row.names = NULL
)

# Create and display the table
kable(moran_results, format = "html", col.names = c("Statistic", "Value"),
      caption = "Moran I Test under randomisation") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE,
                position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2, width = "15em")
Moran I Test under randomisation
Statistic Value
Moran I statistic 0.26
Expectation 0
Variance 0
Standard deviate 103.7
P-value < 2.2e-16

Spatial cross-validation

Code
# Reload the spatial training data and the cached model
load(here::here("training_data.Rdata"))
rf_model_fishing_log <- readRDS(here::here("rf_model_fishing_log.rds"))

# Ensure we have an sf object with point geometry (WGS84)
training_data_sf <- if (inherits(training_data, "sf")) {
  training_data
} else {
  st_as_sf(training_data, coords = c("lon_std", "lat_std"), crs = 4326)
}

# Partition the points into 6 spatial blocks via k-means on the coordinates
set.seed(123)  # for reproducibility
kmeans_result <- kmeans(st_coordinates(training_data_sf), centers = 6)

# Attach the block label used for spatial cross-validation
training_data_sf$block <- as.factor(kmeans_result$cluster)

# Evaluate a fitted random-forest regression on a hold-out set.
#
# Args:
#   model:      fitted randomForest (or ranger) regression model.
#   data:       data frame with the predictors plus the response
#               (log_total_fishing_hours when log_target = TRUE,
#               total_fishing_hours otherwise).
#   log_target: TRUE when the model was trained on log10(y + 1); both
#               predictions and observations are then back-transformed
#               before the error metrics are computed.
#
# Returns a named list of error metrics, R-squared statistics, residual
# summaries and the variable-importance matrix.
evaluate_model <- function(model, data, log_target = FALSE) {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    predictions <- 10^predictions - 1
  }
  
  actual <- if (log_target) 10^data$log_total_fishing_hours - 1 else data$total_fishing_hours
  
  # Basic Error Metrics
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)  # undefined where actual == 0
  medae <- median(abs(actual - predictions), na.rm = TRUE)
  
  # R-squared: last element of the OOB trace, matching randomForest's
  # "% Var explained"
  r_squared <- model$rsq[length(model$rsq)]
  
  # Number of predictors for adjusted R-squared.
  # model$forest$independent.variable.names is a ranger field and is NULL
  # on randomForest objects, which silently made p = 0 and collapsed the
  # adjusted R-squared onto R-squared; fall back to the rows of the
  # importance matrix for randomForest models.
  n <- length(actual)
  p <- if (!is.null(model$forest$independent.variable.names)) {
    length(model$forest$independent.variable.names)
  } else {
    nrow(importance(model))
  }
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))
  
  # Residual Analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)
  
  # Feature Importance (for Random Forest)
  feature_importance <- importance(model)
  
  return(list(
    "Mean Absolute Error" = mae,
    "Root Mean Squared Error" = rmse,
    "Mean Absolute Percentage Error" = mape,
    "Median Absolute Error" = medae,
    "R-Squared" = r_squared,
    "Adjusted R-Squared" = adj_r_squared,
    "Mean of Residuals" = mean_residual,
    "Standard Deviation of Residuals" = sd_residual,
    "Feature Importance" = feature_importance
  ))
}

# Spatially-blocked cross-validation: score the model on each k-means block.
results_list <- vector("list", 6)
performance_metrics <- vector("list", 6)

for (fold in seq_len(6)) {
  # Hold out the fold-th spatial block
  fold_data <- training_data_sf[training_data_sf$block == fold, ]
  
  # Extract the point coordinates for this block
  fold_coords <- st_coordinates(fold_data)
  
  # Flatten the sf object into a plain table the model can predict on
  fold_df <- data.table(
    total_fishing_hours = fold_data$total_fishing_hours,
    log_total_fishing_hours = log10(fold_data$total_fishing_hours + 1),
    total_presence_score = fold_data$total_presence_score,
    dist_shore = fold_data$dist_shore,
    dist_ports = fold_data$dist_ports,
    bathy = fold_data$bathy,
    lon_std = fold_coords[, "X"],
    lat_std = fold_coords[, "Y"]
  )
  
  # Evaluate the model on this block and store the metrics
  performance_metrics[[fold]] <- evaluate_model(rf_model_fishing_log, fold_df, log_target = TRUE)
  
  # Back-transformed predictions kept for inspection/mapping
  fold_log_pred <- predict(rf_model_fishing_log, newdata = fold_df)
  results_list[[fold]] <- data.table(
    observed = fold_data$total_fishing_hours,
    predicted = 10^fold_log_pred - 1,
    fold = fold,
    lon_std = fold_coords[, "X"],
    lat_std = fold_coords[, "Y"]
  )
}

# --- Summarise per-fold performance in one table ---
# Scalar metrics reported for every fold ("Feature Importance" is a matrix
# and is excluded from the tabular summary).
performance_metric_names <- c(
  "Mean Absolute Error",
  "Root Mean Squared Error",
  "Mean Absolute Percentage Error",
  "Median Absolute Error",
  "R-Squared",
  "Adjusted R-Squared",
  "Mean of Residuals",
  "Standard Deviation of Residuals"
)

# Build the table in one pass instead of growing a data.frame with rbind in
# a loop (the old approach coerced every value to character and needed a
# second pass to restore numeric types).
fold_values <- vapply(
  performance_metrics,
  function(fold) unlist(fold[performance_metric_names]),
  numeric(length(performance_metric_names))
)
performance_table <- data.frame(
  Metric = performance_metric_names,
  fold_values,
  stringsAsFactors = FALSE,
  row.names = NULL
)
colnames(performance_table) <- c("Metric", paste0("Fold", 1:6))

# Round the values for display
performance_table_formatted <- performance_table %>%
  mutate(across(Fold1:Fold6, ~ round(., 2)))

# Display the performance table
kable(performance_table_formatted,
      caption = "Model Performance for Each Fold",
      align = c('l', rep('r', 6)),
      format = "html") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE)
Model Performance for Each Fold
Metric Fold1 Fold2 Fold3 Fold4 Fold5 Fold6
Mean Absolute Error 677.18 110.26 41.84 44.39 158.75 95.34
Root Mean Squared Error 2991.82 557.18 196.19 300.53 675.18 393.05
Mean Absolute Percentage Error 75.78 75.41 80.52 65.87 61.37 86.08
Median Absolute Error 26.05 5.98 5.66 5.12 11.84 9.65
R-Squared 0.82 0.82 0.82 0.82 0.82 0.82
Adjusted R-Squared 0.82 0.82 0.82 0.82 0.82 0.82
Mean of Residuals 548.04 100.93 36.77 37.30 129.92 80.80
Standard Deviation of Residuals 2941.26 547.99 192.72 298.21 662.57 384.67

Predict fishing hours in areas with no fishing data

Random forest model

Code
# --- Build the environmental-only training set at 0.5-degree resolution ---
# Join the Random Forest predictions (from SAR) onto the environmental
# rasters and keep complete cases only.
training_data <- combined_data_01deg %>%
  left_join(raster_df, by = c("lon_std", "lat_std")) %>%
  dplyr::select(predicted_fishing_hours, lon_std, lat_std, dist_shore, dist_ports, bathy) %>%
  na.omit()

# Natural-log transform (add 1 to avoid log(0)). Note: earlier models in
# this document use log10; this model is back-transformed with exp() below,
# so the base is consistent within this section.
training_data$log_fishing_hours <- log(training_data$predicted_fishing_hours + 1)

# (dplyr is already attached at the top of the file; the redundant
# mid-script library(dplyr) call was removed.)

# Round a coordinate to the nearest 0.5 degree
round_to_half <- function(x) {
  round(x * 2) / 2
}

# Aggregate predictions (sum) and covariates (mean) into 0.5-degree cells,
# then rename the coordinates back to the lon_std/lat_std convention.
training_data_05deg <- training_data %>%
  mutate(
    lon_05deg = round_to_half(lon_std),
    lat_05deg = round_to_half(lat_std)
  ) %>%
  group_by(lon_05deg, lat_05deg) %>%
  summarise(
    predicted_fishing_hours = sum(predicted_fishing_hours, na.rm = TRUE),
    dist_shore = mean(dist_shore, na.rm = TRUE),
    dist_ports = mean(dist_ports, na.rm = TRUE),
    bathy = mean(bathy, na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  mutate(
    log_fishing_hours = log(predicted_fishing_hours + 1)
  ) %>%
  rename(lon_std = lon_05deg, lat_std = lat_05deg)

# 80/20 train/test split (seeded for reproducibility)
set.seed(123)
sample_index <- sample(seq_len(nrow(training_data_05deg)), 0.8 * nrow(training_data_05deg))
train_data <- training_data_05deg[sample_index, ]
test_data <- training_data_05deg[-sample_index, ]

# Model training (run once, then cached to disk):
#rf_model_env <- randomForest(
#  log_fishing_hours ~ lon_std + lat_std + dist_shore + dist_ports + bathy,
#  data = train_data,
#  ntree = 500,
#  importance = TRUE
#)
#saveRDS(rf_model_env, file = "rf_model_env_global_nodata.rds")

# NOTE(review): this path is relative to the working directory, unlike the
# here::here() calls used elsewhere — confirm both resolve to the same file.
rf_model_env <- readRDS("rf_model_env_global_nodata.rds")

Model performance

Code
# Evaluate a fitted random-forest regression on a hold-out set.
#
# Args:
#   model:      fitted randomForest (or ranger) regression model.
#   data:       data frame with the predictors plus the response column.
#   log_target: TRUE when the model was trained on log(y + 1) (natural log);
#               predictions and observations are back-transformed with exp()
#               before the error metrics are computed.
#   log_column: name of the log-transformed response column in `data`.
#
# Returns a named list of error metrics, R-squared statistics, residual
# summaries and the variable-importance matrix.
evaluate_model <- function(model, data, log_target = FALSE, log_column = "log_fishing_hours") {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    predictions <- exp(predictions) - 1
  }
  
  # NOTE(review): the log_target = FALSE branch expects a `fishing_hours`
  # column that the data sets in this document do not carry — every call
  # here passes log_target = TRUE.
  actual <- if (log_target) exp(data[[log_column]]) - 1 else data$fishing_hours
  
  # Basic Error Metrics
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)  # undefined where actual == 0
  medae <- median(abs(actual - predictions), na.rm = TRUE)
  
  # R-squared: last element of the OOB trace, matching randomForest's
  # "% Var explained"
  r_squared <- model$rsq[length(model$rsq)]
  
  # Number of predictors for adjusted R-squared.
  # model$forest$independent.variable.names is a ranger field and is NULL
  # on randomForest objects, which silently made p = 0 and collapsed the
  # adjusted R-squared onto R-squared; fall back to the rows of the
  # importance matrix for randomForest models.
  n <- length(actual)
  p <- if (!is.null(model$forest$independent.variable.names)) {
    length(model$forest$independent.variable.names)
  } else {
    nrow(importance(model))
  }
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))
  
  # Residual Analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)
  
  # Feature Importance (for Random Forest)
  feature_importance <- importance(model)
  
  return(list(
    "Mean Absolute Error" = mae,
    "Root Mean Squared Error" = rmse,
    "Mean Absolute Percentage Error" = mape,
    "Median Absolute Error" = medae,
    "R-Squared" = r_squared,
    "Adjusted R-Squared" = adj_r_squared,
    "Mean of Residuals" = mean_residual,
    "Standard Deviation of Residuals" = sd_residual,
    "Feature Importance" = feature_importance
  ))
}

# Score the environmental model on the held-out test cells
results_rf_fishing_log <- evaluate_model(rf_model_env, test_data, log_target = TRUE)

# Gather the scalar metrics into a two-column display table
metric_labels <- c("Mean Absolute Error", "Root Mean Squared Error",
                   "Mean Absolute Percentage Error", "Median Absolute Error",
                   "R-Squared", "Adjusted R-Squared",
                   "Mean of Residuals", "Standard Deviation of Residuals")
results_table <- data.frame(
  Metric = metric_labels,
  Value = round(unname(unlist(results_rf_fishing_log[metric_labels])), 2)
)

# Pretty-print: comma separators; two decimals unless very small or large
results_table$Value <- sapply(results_table$Value, function(x) {
  if (abs(x) < 0.01 || abs(x) >= 1e6) {
    format(x, scientific = FALSE, big.mark = ",")
  } else {
    format(round(x, 2), nsmall = 2, big.mark = ",")
  }
})

# Create and display the metrics table
kable(results_table, format = "html", caption = "Model Evaluation Metrics for Log-Transformed Fishing Hours Model", escape = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center")
Model Evaluation Metrics for Log-Transformed Fishing Hours Model
Metric Value
Mean Absolute Error 729.30
Root Mean Squared Error 9,292.07
Mean Absolute Percentage Error 163.96
Median Absolute Error 31.42
R-Squared 0.76
Adjusted R-Squared 0.76
Mean of Residuals 650.98
Standard Deviation of Residuals 9,269.48
Code
# Variable-importance table for the environmental model
feature_importance <- as.data.frame(results_rf_fishing_log$`Feature Importance`)
feature_importance <- cbind(Feature = rownames(feature_importance),
                            feature_importance[, c("%IncMSE", "IncNodePurity")])

# Order predictors from most to least important (by %IncMSE)
feature_importance <- feature_importance[order(feature_importance$`%IncMSE`, decreasing = TRUE), ]

# Render as a styled HTML table
kable(feature_importance, format = "html", digits = 2, 
      col.names = c("Feature", "%IncMSE", "IncNodePurity"),
      caption = "Feature Importance for Log-Transformed Fishing Hours Model") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:3, width = "150px")
Feature Importance for Log-Transformed Fishing Hours Model
Feature %IncMSE IncNodePurity
lat_std 196.88 96300.61
lon_std 174.38 79182.68
dist_shore 156.22 45441.73
dist_ports 138.86 54425.31
bathy 119.70 49331.27

Maps of predictions

Code
# Build the global prediction grid at 0.5-degree resolution from the
# environmental rasters (ocean cells only).
prediction_data_env <- raster_df %>%
  dplyr::select(lon_std, lat_std, dist_shore, dist_ports, bathy) %>%
  filter(dist_shore > 0) %>%  # Only include ocean areas
  mutate(
    lon_std = floor(lon_std * 2) / 2,  # Snap down (floor) to the 0.5-degree bin edge
    lat_std = floor(lat_std * 2) / 2   # NOTE(review): floor, not round-to-nearest as used for the training grid — confirm intended
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(
    dist_shore = mean(dist_shore, na.rm = TRUE),
    dist_ports = mean(dist_ports, na.rm = TRUE),
    bathy = mean(bathy, na.rm = TRUE)
  ) %>%
  na.omit()

# Load the cached random forest model (environmental predictors only)
rf_model_env <- readRDS("rf_model_env_global_nodata.rds")

# Predict log(fishing hours + 1) on the grid
log_predictions_env <- predict(rf_model_env, newdata = prediction_data_env)

# Back-transform predictions (model was trained on log(y + 1))
predictions_env <- exp(log_predictions_env) - 1

# Attach predictions to the grid
prediction_data_env$predicted_fishing_hours <- predictions_env

# --- Global and regional maps of the environment-only predictions ---
global_map <- create_region_map(
  prediction_data_env, world_map, "lon_std", "lat_std",
  c(-180, 180), c(-90, 90),
  "Global Fishing Hours",
  "0.5 degree resolution, based on AIS data, SAR data and Random Forest predictions using environmental data"
)

caribbean_map <- create_region_map(
  data = prediction_data_env,
  world_map = world_map,
  lon_col = "lon_std",
  lat_col = "lat_std",
  lon_range = c(-100, -50),
  lat_range = c(0, 40),
  title = "Predicted Fishing Hours in the Caribbean",
  subtitle = "0.5 degree resolution"
)

indian_european_map <- create_region_map(
  data = prediction_data_env,
  world_map = world_map,
  lon_col = "lon_std",
  lat_col = "lat_std",
  lon_range = c(-20, 80),
  lat_range = c(0, 70),
  title = "Predicted Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
  subtitle = "0.5 degree resolution"
)

asia_map <- create_region_map(
  data = prediction_data_env,
  world_map = world_map,
  lon_col = "lon_std",
  lat_col = "lat_std",
  lon_range = c(60, 180),
  lat_range = c(-20, 60),
  title = "Predicted Fishing Hours in Asia",
  subtitle = "0.5 degree resolution"
)

# Show the global view first
print(global_map)

Code
# Render the Caribbean regional map
print(caribbean_map)

Code
# Render the Indian Ocean to eastern Atlantic regional map
print(indian_european_map)

Code
# Render the Asia regional map
print(asia_map)

Add MPA location as a predictor

Code
# --- Add MPA presence as a candidate predictor ---
# Load the binary MPA raster (marine protected area mask)
mpa_raster <- raster(here::here("Data/mpa_ALL_binary.tif"))

# Sample the raster at every 0.5-degree training cell.
# NOTE(review): cells outside raster coverage yield NA here; the prediction
# grid later replaces NA with 0 but the training data does not — confirm
# this asymmetry is intended.
mpa_data <- raster::extract(mpa_raster, training_data_05deg[, c("lon_std", "lat_std")])

# Add MPA data to the training dataset
training_data_05deg$mpa_present <- mpa_data

# Re-create the 80/20 split used for the environmental model
# (same seed on the same row count, so the indices match).
set.seed(123)
sample_index <- sample(seq_len(nrow(training_data_05deg)), 0.8 * nrow(training_data_05deg))
train_data <- training_data_05deg[sample_index, ]
test_data <- training_data_05deg[-sample_index, ]

# Model training (run once, then cached to disk):
#rf_model_env_mpa <- randomForest(
#  log_fishing_hours ~ lon_std + lat_std + dist_shore + dist_ports + bathy + mpa_present,
#  data = train_data,
#  ntree = 500,
#  importance = TRUE
#)
#saveRDS(rf_model_env_mpa, file = here::here("rf_model_env_mpa_global.rds"))

Model performance

Code
# Evaluate a fitted random-forest regression on a hold-out set.
#
# Args:
#   model:      fitted randomForest (or ranger) regression model.
#   data:       data frame with the predictors plus the response column.
#   log_target: TRUE when the model was trained on log(y + 1) (natural log);
#               predictions and observations are back-transformed with exp()
#               before the error metrics are computed.
#   log_column: name of the log-transformed response column in `data`.
#
# Returns a named list of error metrics, R-squared statistics, residual
# summaries and the variable-importance matrix.
evaluate_model <- function(model, data, log_target = FALSE, log_column = "log_fishing_hours") {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    predictions <- exp(predictions) - 1
  }
  
  # NOTE(review): the log_target = FALSE branch expects a `fishing_hours`
  # column that the data sets in this document do not carry — every call
  # here passes log_target = TRUE.
  actual <- if (log_target) exp(data[[log_column]]) - 1 else data$fishing_hours
  
  # Basic Error Metrics
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)  # undefined where actual == 0
  medae <- median(abs(actual - predictions), na.rm = TRUE)
  
  # R-squared: last element of the OOB trace, matching randomForest's
  # "% Var explained"
  r_squared <- model$rsq[length(model$rsq)]
  
  # Number of predictors for adjusted R-squared.
  # model$forest$independent.variable.names is a ranger field and is NULL
  # on randomForest objects, which silently made p = 0 and collapsed the
  # adjusted R-squared onto R-squared; fall back to the rows of the
  # importance matrix for randomForest models.
  n <- length(actual)
  p <- if (!is.null(model$forest$independent.variable.names)) {
    length(model$forest$independent.variable.names)
  } else {
    nrow(importance(model))
  }
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))
  
  # Residual Analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)
  
  # Feature Importance (for Random Forest)
  feature_importance <- importance(model)
  
  return(list(
    "Mean Absolute Error" = mae,
    "Root Mean Squared Error" = rmse,
    "Mean Absolute Percentage Error" = mape,
    "Median Absolute Error" = medae,
    "R-Squared" = r_squared,
    "Adjusted R-Squared" = adj_r_squared,
    "Mean of Residuals" = mean_residual,
    "Standard Deviation of Residuals" = sd_residual,
    "Feature Importance" = feature_importance
  ))
}

# Score the MPA-augmented model on the held-out test cells
rf_model_env_mpa <- readRDS(here::here("rf_model_env_mpa_global.rds"))
results_rf_fishing_log <- evaluate_model(rf_model_env_mpa, test_data, log_target = TRUE)

# Gather the scalar metrics into a two-column display table
metric_labels <- c("Mean Absolute Error", "Root Mean Squared Error",
                   "Mean Absolute Percentage Error", "Median Absolute Error",
                   "R-Squared", "Adjusted R-Squared",
                   "Mean of Residuals", "Standard Deviation of Residuals")
results_table <- data.frame(
  Metric = metric_labels,
  Value = round(unname(unlist(results_rf_fishing_log[metric_labels])), 2)
)

# Pretty-print: comma separators; two decimals unless very small or large
results_table$Value <- sapply(results_table$Value, function(x) {
  if (abs(x) < 0.01 || abs(x) >= 1e6) {
    format(x, scientific = FALSE, big.mark = ",")
  } else {
    format(round(x, 2), nsmall = 2, big.mark = ",")
  }
})

# Create and display the metrics table
kable(results_table, format = "html", caption = "Model Evaluation Metrics for Log-Transformed Fishing Hours Model with MPA location", escape = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center")
Model Evaluation Metrics for Log-Transformed Fishing Hours Model with MPA location
Metric Value
Mean Absolute Error 618.03
Root Mean Squared Error 7,980.45
Mean Absolute Percentage Error 135.49
Median Absolute Error 26.86
R-Squared 0.80
Adjusted R-Squared 0.80
Mean of Residuals 515.09
Standard Deviation of Residuals 7,964.01
Code
# Variable-importance table for the MPA-augmented model
feature_importance <- as.data.frame(results_rf_fishing_log$`Feature Importance`)
feature_importance <- cbind(Feature = rownames(feature_importance),
                            feature_importance[, c("%IncMSE", "IncNodePurity")])

# Order predictors from most to least important (by %IncMSE)
feature_importance <- feature_importance[order(feature_importance$`%IncMSE`, decreasing = TRUE), ]

# Render as a styled HTML table
kable(feature_importance, format = "html", digits = 2, 
      col.names = c("Feature", "%IncMSE", "IncNodePurity"),
      caption = "Feature Importance for Log-Transformed Fishing Hours Model with MPA location") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:3, width = "150px")
Feature Importance for Log-Transformed Fishing Hours Model with MPA location
Feature %IncMSE IncNodePurity
lat_std 339.29 104313.10
lon_std 317.89 84089.63
dist_shore 164.96 41929.62
dist_ports 141.52 51094.03
bathy 121.45 46684.34
mpa_present 51.95 3389.27

Maps of predictions

Code
# --- Predict everywhere with the MPA-augmented model ---
# Load the MPA binary raster
mpa_raster <- raster(here::here("Data/mpa_ALL_binary.tif"))

# Build the 0.5-degree global ocean grid from the environmental rasters
prediction_data_env <- raster_df %>%
  dplyr::select(lon_std, lat_std, dist_shore, dist_ports, bathy) %>%
  filter(dist_shore > 0) %>%  # Only include ocean areas
  mutate(
    lon_std = floor(lon_std * 2) / 2,  # Snap down (floor) to the 0.5-degree bin edge
    lat_std = floor(lat_std * 2) / 2   # NOTE(review): floor, not round-to-nearest as used for the training grid — confirm intended
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(
    dist_shore = mean(dist_shore, na.rm = TRUE),
    dist_ports = mean(dist_ports, na.rm = TRUE),
    bathy = mean(bathy, na.rm = TRUE)
  ) %>%
  na.omit()

# Extract MPA data for each point in the prediction grid
mpa_data <- raster::extract(mpa_raster, prediction_data_env[, c("lon_std", "lat_std")])

# Add MPA data to the prediction dataset
prediction_data_env$mpa_present <- mpa_data

# Treat cells with no raster coverage (NA) as not protected (0)
prediction_data_env <- prediction_data_env %>% 
  mutate(mpa_present = ifelse(is.na(mpa_present), 0, mpa_present))

# Load the random forest model that includes MPA as a predictor
rf_model_env_mpa <- readRDS("rf_model_env_mpa_global.rds")

# Make predictions using the model that includes MPA
log_predictions_env_mpa <- predict(rf_model_env_mpa, newdata = prediction_data_env)

# Back-transform predictions (model was trained on log(y + 1))
predictions_env_mpa <- exp(log_predictions_env_mpa) - 1

# Attach predictions to the grid and cache the result to disk
prediction_data_env$predicted_fishing_hours <- predictions_env_mpa
save(prediction_data_env, file=here::here("Data", "predicted_fishing_everywhere_05Deg.Rdata"))

# --- Global and regional maps of the MPA-augmented predictions ---
global_map <- create_region_map(
  prediction_data_env, world_map, "lon_std", "lat_std",
  c(-180, 180), c(-90, 90),
  "Global Fishing Hours",
  "0.5 degree resolution, based on AIS data, SAR data, MPA presence, and Random Forest predictions using environmental data"
)

caribbean_map <- create_region_map(
  prediction_data_env, world_map, "lon_std", "lat_std",
  c(-100, -50), c(0, 40),
  "Predicted Fishing Hours in the Caribbean",
  "0.5 degree resolution"
)

indian_european_map <- create_region_map(
  prediction_data_env, world_map, "lon_std", "lat_std",
  c(-20, 80), c(0, 70),
  "Predicted Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
  "0.5 degree resolution"
)

asia_map <- create_region_map(
  prediction_data_env, world_map, "lon_std", "lat_std",
  c(60, 180), c(-20, 60),
  "Predicted Fishing Hours in Asia",
  "0.5 degree resolution"
)

# Show the global view first
print(global_map)

Code
# Render the Caribbean regional map
print(caribbean_map)

Code
# Render the Indian Ocean to eastern Atlantic regional map
print(indian_european_map)

Code
# Render the Asia regional map
print(asia_map)